%matplotlib inline
import pandas as pd
import seaborn as sns
import numpy as np
import scipy.sparse as sp
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
import random
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.datasets import make_classification
# Load the impression log.  ``DataFrame.from_csv`` is deprecated (and removed
# in pandas 1.0); its documented replacement is ``read_csv`` with the first
# column used as the index and date parsing of the index enabled, which is
# what ``from_csv`` did by default.
df = pd.read_csv('data/advertising.csv', index_col=0, parse_dates=True)
df.head()
pub_domain | user_id | viewed | ctr | click | |
---|---|---|---|---|---|
1000 | 1000 | 1410160 | True | 0.463089 | True |
1000 | 1000 | 1157972 | True | 0.463089 | False |
1000 | 1000 | 1061623 | True | 0.463089 | True |
1001 | 1001 | 1169015 | True | 0.303266 | True |
1001 | 1001 | 1111880 | True | 0.303266 | False |
# Per-publisher summary table: one row per pub_domain.
#   vImps  - sum of the `viewed` flag (viewed impressions)
#   clicks - sum of the `click` flag
#   nImps  - number of rows (impressions) for the publisher
df_pub = df[['viewed', 'click', 'pub_domain']].groupby('pub_domain').sum()
df_pub['nImps'] = df.pub_domain.value_counts()
# Assign a flat list of names.  A nested list ([[...]]) is interpreted by
# newer pandas versions as the levels of a MultiIndex, after which the column
# lookups below no longer return plain Series.
df_pub.columns = ['vImps', 'clicks', 'nImps']
df_pub['CTR'] = df_pub.clicks / df_pub.nImps

# Distinct users per publisher, over all impressions and over viewed ones.
# dropna=False keeps the original len(unique()) semantics (NaN counts as a
# value).  Publishers without any viewed impression get NaN for vReach here
# and 0 after the fillna at the end.
df_pub['reach'] = df.groupby('pub_domain')['user_id'].nunique(dropna=False)
df_pub['vReach'] = (df[df.viewed == True]  # noqa: E712 - element-wise compare
                    .groupby('pub_domain')['user_id']
                    .nunique(dropna=False))

# NOTE(review): vRate, meanViewTime and exposure are synthesized from CTR
# multiplied by unseeded random noise, so they differ on every run and carry
# the CTR value itself.  The random-forest cell further down uses them as
# features to predict CTR - confirm that this is intended.
df_pub['vRate'] = ((df_pub.vImps / df_pub.nImps)
                   * [random.random() for _ in range(len(df_pub))]
                   * df_pub.CTR)
df_pub['reachRate'] = df_pub.reach / df_pub.nImps
df_pub['vReachRate'] = df_pub.vReach / df_pub.nImps
df_pub['meanViewTime'] = [60 * random.random() * ctr for ctr in df_pub.CTR]
df_pub['exposure'] = df_pub.vImps * df_pub.meanViewTime
df_pub = df_pub.fillna(0)
# Size of the full data set: rows, distinct publishers, distinct users.
nImps = df.shape[0]
nPubs = len(df['pub_domain'].unique())
nUsers = len(df['user_id'].unique())
print(
    'nImps={}\nnPubs={}\nnUsers={}'.format(nImps, nPubs, nUsers)
)
nImps=1000000 nPubs=3993 nUsers=377464
# The same three counts, restricted to the impressions that were clicked.
df2 = df[df['click'] == True]  # noqa: E712 - element-wise comparison
nImps2 = df2.shape[0]
nPubs2 = len(df2['pub_domain'].unique())
nUsers2 = len(df2['user_id'].unique())
print(
    'nImps2={}\nnPubs2={}\nnUsers2={}'.format(nImps2, nPubs2, nUsers2)
)
nImps2=161334 nPubs2=3920 nUsers2=131535
# Number of clicked impressions and their share of all impressions (percent).
nClicks = df['click'].value_counts()[True]
print('nClicks={} ({}%)'.format(
    nClicks,
    round(float(nClicks) * 100 / nImps, 2),
))
nClicks=161334 (16.13%)
# Number of viewed impressions and their share of all impressions (percent).
nViews = df['viewed'].value_counts()[True]
print('nViews={} ({}%)'.format(
    nViews,
    round(float(nViews) * 100 / nImps, 2),
))
nViews=800306 (80.03%)
# Cell count of a dense user x publisher matrix: for every impression, and
# for clicked impressions only.
uimsz = nPubs * nUsers
uimsz2 = nPubs2 * nUsers2
print('uim, all imps: {}, clicks only: {}'.format(uimsz, uimsz2))
uim, all imps: 1507213752, clicks only: 515617200
# Impressions per publisher, plotted as a distribution on a log10 scale.
f = df.groupby('pub_domain').size()
sns.distplot(np.log10(f));
# Re-display the first rows of the raw impression table.
df.head()
pub_domain | user_id | viewed | ctr | click | |
---|---|---|---|---|---|
1000 | 1000 | 1410160 | True | 0.463089 | True |
1000 | 1000 | 1157972 | True | 0.463089 | False |
1000 | 1000 | 1061623 | True | 0.463089 | True |
1001 | 1001 | 1169015 | True | 0.303266 | True |
1001 | 1001 | 1111880 | True | 0.303266 | False |
# Histogram of impressions per publisher, one bin per count from 2 to 99.
_ = plt.hist(df['pub_domain'].value_counts(), bins=range(2, 100))

# Index the frame by publisher (the pub_domain column itself is kept), then
# hold out 20% of the impressions.
# NOTE(review): no random_state is passed, so the split - and every number
# derived from df5 / df6 below - changes from run to run.
df.set_index(df['pub_domain'], inplace=True)
df5, df6 = train_test_split(df, test_size=0.2)
# Map every training user / publisher to a dense integer position, numbered
# in order of first appearance in df5.
users = df5['user_id'].unique()
pubs = df5['pub_domain'].unique()
user_idx = {user: position for position, user in enumerate(users)}
pub_idx = {pub: position for position, pub in enumerate(pubs)}
# Inverse of pub_idx: pub_domain = pidx[integer_key]
pidx = {position: pub for pub, position in pub_idx.items()}
def makeUIM(df):
    """Build the binary user x publisher interaction matrix for *df*.

    A cell is set to 1 when *df* contains at least one row for that user and
    publisher with ``click == True`` and a truthy ``viewed``.  Row and column
    positions are taken from the module-level ``user_idx`` and ``pub_idx``
    maps.  Rows of *df* whose user or publisher is missing from those maps,
    or whose mapped position does not fit into the matrix, are skipped; this
    is the best-effort behaviour the former bare ``except: pass`` provided,
    without also hiding unrelated errors.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``user_id``, ``pub_domain``, ``viewed`` and ``click``
        columns.

    Returns
    -------
    scipy.sparse.lil_matrix
        Shape ``(len(user_idx), number of distinct pub_domain values in df)``.
    """
    nUsers = len(user_idx)
    # NOTE(review): the column count is derived from the publishers present
    # in *df*, not from len(pub_idx).  For a frame with fewer publishers than
    # pub_idx, publishers mapped to higher positions are dropped.  Left
    # unchanged because the evaluation code iterates over the resulting
    # shape - confirm whether len(pub_idx) was intended.
    nPubs = len(df.pub_domain.unique())
    V = sp.lil_matrix((nUsers, nPubs))

    # Only clicked rows can set a cell.  The result does not depend on row
    # order, so the frame is not sorted first.
    clicked = df[df.click == True]  # noqa: E712 - element-wise comparison
    for user_id, pub_domain, viewed in zip(clicked.user_id,
                                           clicked.pub_domain,
                                           clicked.viewed):
        if not viewed:
            continue
        row = user_idx.get(user_id)
        col = pub_idx.get(pub_domain)
        if row is None or col is None:
            continue  # user or publisher unknown to the index maps
        if not (0 <= row < nUsers and 0 <= col < nPubs):
            continue  # mapped position lies outside this matrix
        V[row, col] = 1
    return V
# Training interactions: users x publishers (V) and publishers x users (VT).
df5 = df5.sort_values('user_id')
V = makeUIM(df5)
VT = V.T

# Held-out interactions, stored publishers x users.
df6 = df6.sort_values('user_id')
TT = makeUIM(df6).T

# Truncated SVD of the training matrix (400 components); the singular values
# are plotted in reversed order - presumably to show the largest first.
u, s, vt = svds(V, k=400)
plt.plot(s[::-1])
[<matplotlib.lines.Line2D at 0x114d42850>]
# Per-publisher impression count, click count and CTR on the held-out split.
ts0 = df6.pub_domain.value_counts()
tsums = pd.DataFrame.from_dict({
    # Wrapped in a Series keyed by itself so it is aligned by label like the
    # other two columns.  A bare Index is filled in positionally (in
    # value_counts order) and ends up next to the wrong nImps / clicks.
    'pub_domain': pd.Series(ts0.index, index=ts0.index),
    'nImps': ts0,
    'clicks': df6.groupby('pub_domain')['click'].sum()
})
tsums.reset_index(inplace=True)
tsums['CTR'] = tsums.clicks / tsums.nImps.astype(float)

# The same summary over the complete data set; this frame stays indexed by
# publisher label.
ts1 = df.pub_domain.value_counts()
pubsums = pd.DataFrame.from_dict({
    'pub_domain': pd.Series(ts1.index, index=ts1.index),
    'nImps': ts1,
    'clicks': df.groupby('pub_domain')['click'].sum()
})
pubsums['CTR'] = pubsums.clicks / pubsums.nImps.astype(float)
# Dense ndarray copy of the held-out publisher x user matrix, used as the
# query vectors for the neighbour search.
tt = np.asarray(TT.todense())

# Wide, short figures for the side-by-side plots.
plt.rcParams["figure.figsize"] = [12.0, 4.0]
# Neighbour counts to evaluate: kMin <= k < kMax.
kMin, kMax = 1, 10


def _uniform_weights(x):
    # Every neighbour counts equally: a list of ones as long as x.
    return [1] * len(x)


def _inverse_weights(x):
    # Weight falls off with the distance.
    return 1 / x


def _inverse_square_weights(x):
    # Weight falls off with the squared distance.
    return 1 / (x ** 2)


# Candidate neighbour-weighting schemes, keyed by a short label.
weightFunctions = {
    'f1': _uniform_weights,
    'f2': _inverse_weights,
    'f3': _inverse_square_weights,
}
# k-nearest-neighbour CTR prediction, evaluated for every weighting scheme
# and every k in [kMin, kMax).  The model is fitted on the training publisher
# vectors (rows of VT) and queried with each held-out publisher vector (rows
# of tt).  The prediction is the weighted mean of the neighbours' positive
# CTRs; the score is the RMSE normalised by the range of the actual CTRs, in
# percent.
#
# Row i of VT / TT belongs to publisher pidx[i], while pubsums is ordered by
# publisher label.  CTRs are therefore looked up by label: using neighbour
# positions as positional (iloc) indices would read other publishers' rows.
train_ctr = pubsums['CTR']
# Held-out CTR per publisher (mean of the click flag).  Grouping by the raw
# values avoids depending on how df6 happens to be indexed.
test_ctr = (df6['click'].astype(float)
            .groupby(df6['pub_domain'].values)
            .mean()
            .to_dict())
# NOTE(review): pubsums is computed over the complete data set, so the
# neighbours' CTRs include held-out clicks - confirm that this is intended.

for idx, (f, wf) in enumerate(weightFunctions.items()):
    print('weight function{}'.format(f))
    rmseL = []
    for nNeighbors in range(kMin, kMax):
        neigh = NearestNeighbors(n_neighbors=nNeighbors)
        neigh.fit(VT)
        # Plain lists: appending to a Series copies it on every step.
        actual, predicted = [], []
        for i in range(TT.shape[0]):
            pub = pidx.get(i)
            if pub not in test_ctr:
                # No held-out impressions for this row's publisher, hence no
                # ground truth to score against.
                continue
            dist, pos = neigh.kneighbors(tt[i, :].reshape(1, -1),
                                         return_distance=True)
            y = train_ctr.reindex([pidx[j] for j in pos[0]]).values
            # Only neighbours with a positive CTR take part in the average
            # (a missing CTR is NaN and compares False).
            keep = y > 0
            if keep.any():
                w = np.asarray(wf(pd.Series(dist[0][keep])), dtype=float)
                # A zero distance makes 1/d infinite and the weighted mean
                # NaN.  Let exact matches decide alone, equally weighted.
                exact = np.isinf(w)
                if exact.any():
                    w = exact.astype(float)
                try:
                    p0 = np.average(y[keep], weights=w)
                except ZeroDivisionError:
                    # The weights sum to zero; fall back to 0.
                    p0 = 0
            else:
                p0 = 0
            actual.append(test_ctr[pub])
            predicted.append(p0)
        act = pd.Series(actual, dtype=float)
        pred = pd.Series(predicted, dtype=float)
        rmse = ((act.sub(pred).pow(2).mean()) ** .5 / (act.max() - act.min())) * 100
        rmseL.append(rmse)
        print('{}. {}'.format(nNeighbors, rmse))
    # One panel per weighting scheme, side by side.
    plt0 = plt.subplot(1, len(weightFunctions), idx + 1)
    plt0.plot(range(kMin, kMax), rmseL)
plt.tight_layout(pad=2.0)
weight functionf1 1. 16.0163810193 2. 13.1859662817 3. 12.9689393099 4. 16.680206999 5. 14.6384491895 6. 14.0660686046 7. 14.0650293768 8. 13.4824095333 9. 13.942187028 weight functionf2 1. 15.2772888351 2. 12.3886556128 3. 12.1874934405 4. 16.194856847 5. 14.0479046742 6. 13.4458243001 7. 13.4445755208 8. 12.8089387628 9. 13.3148983652 weight functionf3 1. 15.2772888351 2. 12.3886674674 3. 12.1874643295 4. 16.1946693787 5. 14.0477756095 6. 13.4457115246 7. 13.444468316 8. 12.8088199565 9. 13.3148077251
# Display the per-publisher summary table.
df_pub
vImps | clicks | nImps | CTR | reach | vReach | vRate | reachRate | vReachRate | meanViewTime | exposure | |
---|---|---|---|---|---|---|---|---|---|---|---|
pub_domain | |||||||||||
1000 | 3.0 | 2.0 | 3 | 0.666667 | 3 | 3.0 | 0.275262 | 1.000000 | 1.000000 | 33.520006 | 100.560017 |
1001 | 12.0 | 3.0 | 13 | 0.230769 | 13 | 12.0 | 0.122078 | 1.000000 | 0.923077 | 3.529025 | 42.348299 |
1002 | 8.0 | 5.0 | 11 | 0.454545 | 11 | 8.0 | 0.118468 | 1.000000 | 0.727273 | 10.604967 | 84.839733 |
1003 | 23.0 | 0.0 | 26 | 0.000000 | 26 | 23.0 | 0.000000 | 1.000000 | 0.884615 | 0.000000 | 0.000000 |
1004 | 15.0 | 7.0 | 22 | 0.318182 | 22 | 15.0 | 0.020841 | 1.000000 | 0.681818 | 5.079108 | 76.186626 |
1005 | 30.0 | 10.0 | 40 | 0.250000 | 40 | 30.0 | 0.071003 | 1.000000 | 0.750000 | 1.627720 | 48.831589 |
1006 | 24.0 | 6.0 | 29 | 0.206897 | 29 | 24.0 | 0.084437 | 1.000000 | 0.827586 | 7.496946 | 179.926693 |
1007 | 28.0 | 8.0 | 34 | 0.235294 | 34 | 28.0 | 0.091125 | 1.000000 | 0.823529 | 4.873371 | 136.454401 |
1008 | 32.0 | 6.0 | 36 | 0.166667 | 36 | 32.0 | 0.090447 | 1.000000 | 0.888889 | 9.077761 | 290.488350 |
1009 | 42.0 | 2.0 | 58 | 0.034483 | 58 | 42.0 | 0.022988 | 1.000000 | 0.724138 | 0.627032 | 26.335348 |
1010 | 47.0 | 4.0 | 57 | 0.070175 | 57 | 47.0 | 0.004582 | 1.000000 | 0.824561 | 1.474816 | 69.316332 |
1011 | 41.0 | 2.0 | 51 | 0.039216 | 51 | 41.0 | 0.000835 | 1.000000 | 0.803922 | 1.736893 | 71.212603 |
1012 | 39.0 | 12.0 | 47 | 0.255319 | 47 | 39.0 | 0.058998 | 1.000000 | 0.829787 | 7.929435 | 309.247953 |
1013 | 51.0 | 10.0 | 63 | 0.158730 | 63 | 51.0 | 0.022737 | 1.000000 | 0.809524 | 3.495873 | 178.289529 |
1014 | 52.0 | 3.0 | 64 | 0.046875 | 64 | 52.0 | 0.036995 | 1.000000 | 0.812500 | 1.926381 | 100.171790 |
1015 | 51.0 | 14.0 | 66 | 0.212121 | 66 | 51.0 | 0.151345 | 1.000000 | 0.772727 | 6.375563 | 325.153704 |
1016 | 61.0 | 0.0 | 73 | 0.000000 | 73 | 61.0 | 0.000000 | 1.000000 | 0.835616 | 0.000000 | 0.000000 |
1017 | 70.0 | 28.0 | 88 | 0.318182 | 87 | 69.0 | 0.060775 | 0.988636 | 0.784091 | 2.140038 | 149.802684 |
1018 | 64.0 | 11.0 | 80 | 0.137500 | 80 | 64.0 | 0.076433 | 1.000000 | 0.800000 | 7.370584 | 471.717395 |
1019 | 80.0 | 13.0 | 103 | 0.126214 | 103 | 80.0 | 0.052028 | 1.000000 | 0.776699 | 1.041808 | 83.344671 |
1020 | 78.0 | 4.0 | 100 | 0.040000 | 100 | 78.0 | 0.015776 | 1.000000 | 0.780000 | 2.303285 | 179.656253 |
1021 | 88.0 | 29.0 | 106 | 0.273585 | 106 | 88.0 | 0.126739 | 1.000000 | 0.830189 | 2.302842 | 202.650140 |
1022 | 81.0 | 10.0 | 105 | 0.095238 | 105 | 81.0 | 0.004029 | 1.000000 | 0.771429 | 3.049524 | 247.011468 |
1023 | 98.0 | 10.0 | 119 | 0.084034 | 119 | 98.0 | 0.056959 | 1.000000 | 0.823529 | 4.639320 | 454.653313 |
1024 | 98.0 | 23.0 | 124 | 0.185484 | 124 | 98.0 | 0.033021 | 1.000000 | 0.790323 | 1.170400 | 114.699215 |
1025 | 90.0 | 50.0 | 119 | 0.420168 | 119 | 90.0 | 0.239107 | 1.000000 | 0.756303 | 3.018353 | 271.651810 |
1026 | 108.0 | 34.0 | 129 | 0.263566 | 129 | 108.0 | 0.093460 | 1.000000 | 0.837209 | 15.454402 | 1669.075431 |
1027 | 116.0 | 73.0 | 141 | 0.517730 | 141 | 116.0 | 0.211257 | 1.000000 | 0.822695 | 14.865386 | 1724.384743 |
1028 | 110.0 | 17.0 | 140 | 0.121429 | 140 | 110.0 | 0.024198 | 1.000000 | 0.785714 | 1.399765 | 153.974098 |
1029 | 120.0 | 13.0 | 152 | 0.085526 | 152 | 120.0 | 0.059837 | 1.000000 | 0.789474 | 0.058177 | 6.981241 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4963 | 3.0 | 0.0 | 5 | 0.000000 | 5 | 3.0 | 0.000000 | 1.000000 | 0.600000 | 0.000000 | 0.000000 |
4964 | 3.0 | 1.0 | 4 | 0.250000 | 4 | 3.0 | 0.168539 | 1.000000 | 0.750000 | 2.949026 | 8.847079 |
4965 | 1.0 | 1.0 | 2 | 0.500000 | 2 | 1.0 | 0.004760 | 1.000000 | 0.500000 | 29.457369 | 29.457369 |
4966 | 2.0 | 1.0 | 4 | 0.250000 | 4 | 2.0 | 0.025950 | 1.000000 | 0.500000 | 9.397525 | 18.795049 |
4967 | 1.0 | 0.0 | 2 | 0.000000 | 2 | 1.0 | 0.000000 | 1.000000 | 0.500000 | 0.000000 | 0.000000 |
4968 | 5.0 | 1.0 | 5 | 0.200000 | 5 | 5.0 | 0.114428 | 1.000000 | 1.000000 | 0.044619 | 0.223097 |
4969 | 5.0 | 0.0 | 5 | 0.000000 | 5 | 5.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 |
4970 | 1.0 | 0.0 | 1 | 0.000000 | 1 | 1.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 |
4971 | 1.0 | 0.0 | 1 | 0.000000 | 1 | 1.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 |
4972 | 5.0 | 0.0 | 6 | 0.000000 | 6 | 5.0 | 0.000000 | 1.000000 | 0.833333 | 0.000000 | 0.000000 |
4973 | 2.0 | 0.0 | 2 | 0.000000 | 2 | 2.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 |
4974 | 1.0 | 2.0 | 2 | 1.000000 | 2 | 1.0 | 0.005310 | 1.000000 | 0.500000 | 52.390245 | 52.390245 |
4975 | 2.0 | 0.0 | 2 | 0.000000 | 2 | 2.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 |
4976 | 2.0 | 0.0 | 2 | 0.000000 | 2 | 2.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 |
4977 | 2.0 | 0.0 | 4 | 0.000000 | 4 | 2.0 | 0.000000 | 1.000000 | 0.500000 | 0.000000 | 0.000000 |
4978 | 4.0 | 0.0 | 5 | 0.000000 | 5 | 4.0 | 0.000000 | 1.000000 | 0.800000 | 0.000000 | 0.000000 |
4979 | 1.0 | 1.0 | 4 | 0.250000 | 4 | 1.0 | 0.017988 | 1.000000 | 0.250000 | 5.202126 | 5.202126 |
4980 | 2.0 | 2.0 | 2 | 1.000000 | 2 | 2.0 | 0.936463 | 1.000000 | 1.000000 | 59.367776 | 118.735551 |
4982 | 1.0 | 0.0 | 2 | 0.000000 | 2 | 1.0 | 0.000000 | 1.000000 | 0.500000 | 0.000000 | 0.000000 |
4983 | 1.0 | 0.0 | 1 | 0.000000 | 1 | 1.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 |
4985 | 2.0 | 0.0 | 2 | 0.000000 | 2 | 2.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 |
4986 | 1.0 | 0.0 | 1 | 0.000000 | 1 | 1.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 |
4987 | 1.0 | 1.0 | 1 | 1.000000 | 1 | 1.0 | 0.246574 | 1.000000 | 1.000000 | 50.474694 | 50.474694 |
4988 | 1.0 | 0.0 | 2 | 0.000000 | 2 | 1.0 | 0.000000 | 1.000000 | 0.500000 | 0.000000 | 0.000000 |
4990 | 2.0 | 0.0 | 3 | 0.000000 | 3 | 2.0 | 0.000000 | 1.000000 | 0.666667 | 0.000000 | 0.000000 |
4991 | 1.0 | 0.0 | 1 | 0.000000 | 1 | 1.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 |
4992 | 0.0 | 0.0 | 1 | 0.000000 | 1 | 0.0 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
4993 | 0.0 | 0.0 | 1 | 0.000000 | 1 | 0.0 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
4996 | 1.0 | 0.0 | 1 | 0.000000 | 1 | 1.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 |
4999 | 1.0 | 0.0 | 1 | 0.000000 | 1 | 1.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 |
3993 rows × 11 columns
from sklearn.ensemble import RandomForestRegressor
from sklearn import cross_validation
# Predict the per-publisher CTR from the remaining df_pub columns with a
# random forest: 60/40 train/test split, then cross-validation on the
# training part.
# NOTE(review): vRate, meanViewTime and exposure are built from CTR itself
# when df_pub is assembled, so the scores reflect that - confirm intent.
features = [
    'exposure', 'meanViewTime', 'nImps', 'reach', 'reachRate',
    'vImps', 'vRate', 'vReach', 'vReachRate',
]
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    df_pub[features],
    df_pub['CTR'],
    test_size=0.40,
    random_state=0,
)
reg = RandomForestRegressor(n_estimators=100, n_jobs=-1)
model = reg.fit(X_train, y_train)
scores = cross_validation.cross_val_score(model, X_train, y_train)
print(scores, scores.mean())
(array([ 0.6366208 , 0.62995482, 0.69740879]), 0.65466146813267001)
# Score of the fitted forest on the held-out 40% of publishers.
model.score(X_test, y_test)
0.7195525938702716
# Bar chart of the forest's feature importances, one bar per entry of
# `features`, labelled with the feature name.
plt.rcParams["figure.figsize"] = [12.0, 4.0]
plt.bar(range(len(features)), model.feature_importances_, align='center')
_ = plt.xticks(range(len(features)), features)